This document was updated on 2017-06-21.
library(stringr)
library(readr)
library(dplyr)
library(ggvis)
library(ISLR)
c() combines its inputs into a vector.
x <- c(1,3,2,5)
x
## [1] 1 3 2 5
The = symbol works too.
y = c(1,4,3)
y
## [1] 1 4 3
Use ?c to get help with c() or any other function.
Use ?length to see what the length function does. This works for any function.
x <- c(1,6,2)
y <- c(1,4,3)
length(x)
## [1] 3
length(y)
## [1] 3
x + y
## [1] 2 10 5
ls() allows you to look at a list of all objects.
rm() is used to delete objects.
ls()
## [1] "x" "y"
rm(x,y)
ls()
## character(0)
rm(list=ls())
matrix() will create a matrix
x = matrix(data=c(1,2,3,4), nrow = 2, ncol = 2)
x
## [,1] [,2]
## [1,] 1 3
## [2,] 2 4
x = matrix(c(1,2,3,4), 2, 2, byrow=T)
x
## [,1] [,2]
## [1,] 1 2
## [2,] 3 4
sqrt() will work on each element of the matrix.
x^2 will square each element of the matrix.
sqrt(x)
## [,1] [,2]
## [1,] 1.000000 1.414214
## [2,] 1.732051 2.000000
x^2
## [,1] [,2]
## [1,] 1 4
## [2,] 9 16
rnorm creates a vector of random values sampled from the normal distribution.
cor will compute the correlation between two vectors.
x = rnorm(50)
y = x + rnorm(50, mean=50, sd = .1)
cor(x,y)
## [1] 0.9931957
set.seed will allow you to create the exact same set of random numbers.
set.seed(1303)
rnorm(50)
## [1] -1.1439763145 1.3421293656 2.1853904757 0.5363925179 0.0631929665
## [6] 0.5022344825 -0.0004167247 0.5658198405 -0.5725226890 -1.1102250073
## [11] -0.0486871234 -0.6956562176 0.8289174803 0.2066528551 -0.2356745091
## [16] -0.5563104914 -0.3647543571 0.8623550343 -0.6307715354 0.3136021252
## [21] -0.9314953177 0.8238676185 0.5233707021 0.7069214120 0.4202043256
## [26] -0.2690521547 -1.5103172999 -0.6902124766 -0.1434719524 -1.0135274099
## [31] 1.5732737361 0.0127465055 0.8726470499 0.4220661905 -0.0188157917
## [36] 2.6157489689 -0.6931401748 -0.2663217810 -0.7206364412 1.3677342065
## [41] 0.2640073322 0.6321868074 -1.3306509858 0.0268888182 1.0406363208
## [46] 1.3120237985 -0.0300020767 -0.2500257125 0.0234144857 1.6598706557
mean calculates the mean of a vector of numbers.
var calculates the variance of a vector of numbers.
set.seed(3)
y = rnorm(100)
mean(y)
## [1] 0.01103557
var(y)
## [1] 0.7328675
# calculate standard deviation
sqrt(var(y))
## [1] 0.8560768
sd(y)
## [1] 0.8560768
x = rnorm(100)
y = rnorm(100)
plot(x,y)
plot(x,y,
xlab = "this is the x-axis",
ylab = "this is the y-axis",
main = "Plot of X vs Y"
)
data.frame(x=rnorm(100), y=rnorm(100)) %>%
ggvis(~x, ~y) %>%
layer_points(fill := "blue")
Use pdf() or jpeg() to save a plot to a file.
system lets you run a system command on the computer.
dev.off says we are done creating the plot.
system("mkdir images")
pdf("images/Figure.pdf")
plot(x,y,col="green")
dev.off()
## quartz_off_screen
## 2
seq creates a sequence of numbers (see ?seq).
x = seq(1:10)
x
## [1] 1 2 3 4 5 6 7 8 9 10
## This is a shortcut
x = 1:10
x
## [1] 1 2 3 4 5 6 7 8 9 10
x = seq(-pi, pi, length=20)
x
## [1] -3.1415927 -2.8108987 -2.4802047 -2.1495108 -1.8188168 -1.4881228
## [7] -1.1574289 -0.8267349 -0.4960409 -0.1653470 0.1653470 0.4960409
## [13] 0.8267349 1.1574289 1.4881228 1.8188168 2.1495108 2.4802047
## [19] 2.8108987 3.1415927
contour creates a contour plot of 3-dimensional data (like a topographic map).
y = x
f = outer(x, y, function(x,y) cos(y)/(1+x^2))
contour(x,y,f)
contour(x,y,f,nlevels=45, add=T)
fa = (f-t(f))/2
contour(x,y,fa, nlevels=15)
image works like contour, but it uses colors like a heatmap.
persp creates a 3D plot. theta and phi control the viewing angle.
image(x,y,fa)
persp(x,y,fa)
persp(x,y,fa, theta=30)
persp(x,y,fa, theta=30, phi=20)
persp(x,y,fa, theta=30, phi=70)
persp(x,y,fa, theta=30, phi=40)
A = matrix(1:16,4,4)
A
## [,1] [,2] [,3] [,4]
## [1,] 1 5 9 13
## [2,] 2 6 10 14
## [3,] 3 7 11 15
## [4,] 4 8 12 16
A[2,3]
## [1] 10
A[c(1,3), c(2,4)]
## [,1] [,2]
## [1,] 5 13
## [2,] 7 15
A[1:3,2:4]
## [,1] [,2] [,3]
## [1,] 5 9 13
## [2,] 6 10 14
## [3,] 7 11 15
A[1:2,]
## [,1] [,2] [,3] [,4]
## [1,] 1 5 9 13
## [2,] 2 6 10 14
A[,1:2]
## [,1] [,2]
## [1,] 1 5
## [2,] 2 6
## [3,] 3 7
## [4,] 4 8
A[1,]
## [1] 1 5 9 13
A[-c(1,3),]
## [,1] [,2] [,3] [,4]
## [1,] 2 6 10 14
## [2,] 4 8 12 16
dim(A)
## [1] 4 4
df <- data.frame(a=1:4, b=5:8, c=9:12, d=13:16)
df
## a b c d
## 1 1 5 9 13
## 2 2 6 10 14
## 3 3 7 11 15
## 4 4 8 12 16
df[2,3]
## [1] 10
df[c(1,3), c(2,4)]
## b d
## 1 5 13
## 3 7 15
df[1:3,2:4]
## b c d
## 1 5 9 13
## 2 6 10 14
## 3 7 11 15
df[1:2,]
## a b c d
## 1 1 5 9 13
## 2 2 6 10 14
df[1:2] ## it defaults to the columns if you do not put in the ,
## a b
## 1 1 5
## 2 2 6
## 3 3 7
## 4 4 8
df[,1:2]
## a b
## 1 1 5
## 2 2 6
## 3 3 7
## 4 4 8
df[1,]
## a b c d
## 1 1 5 9 13
df[-c(1,3),]
## a b c d
## 2 2 6 10 14
## 4 4 8 12 16
dim(df)
## [1] 4 4
df$a
## [1] 1 2 3 4
df[c('a','b')]
## a b
## 1 1 5
## 2 2 6
## 3 3 7
## 4 4 8
df[1:2, c('a','b')]
## a b
## 1 1 5
## 2 2 6
df[c(1,4), c('a','d')]
## a d
## 1 1 13
## 4 4 16
fix opens a spreadsheet viewer, but it's really slow.
read_csv is the key function. It has many arguments to customize loading the data, but its defaults do a great job and make life easy. In most cases it does exactly what you would want.
## This just needs to be run the first time
# system("mkdir data")
# write_csv(Auto, "data/Auto.csv")
df_auto <- read_csv("data/Auto.csv")
str(df_auto)
## Classes 'tbl_df', 'tbl' and 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : int 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : chr "chevrolet chevelle malibu" "buick skylark 320" "plymouth satellite" "amc rebel sst" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 9
## .. ..$ mpg : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ cylinders : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ displacement: list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ horsepower : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ weight : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ acceleration: list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ year : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ origin : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ name : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
glimpse(df_auto)
## Observations: 392
## Variables: 9
## $ mpg <dbl> 18, 15, 18, 16, 17, 15, 14, 14, 14, 15, 15, 14, 1...
## $ cylinders <int> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 6, 6...
## $ displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390,...
## $ horsepower <int> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190,...
## $ weight <dbl> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 4...
## $ acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10....
## $ year <int> 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 7...
## $ origin <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1...
## $ name <chr> "chevrolet chevelle malibu", "buick skylark 320",...
head(df_auto)
## # A tibble: 6 x 9
## mpg cylinders displacement horsepower weight acceleration year
## <dbl> <int> <dbl> <int> <dbl> <dbl> <int>
## 1 18 8 307 130 3504 12.0 70
## 2 15 8 350 165 3693 11.5 70
## 3 18 8 318 150 3436 11.0 70
## 4 16 8 304 150 3433 12.0 70
## 5 17 8 302 140 3449 10.5 70
## 6 15 8 429 198 4341 10.0 70
## # ... with 2 more variables: origin <int>, name <chr>
names(df_auto)
## [1] "mpg" "cylinders" "displacement" "horsepower"
## [5] "weight" "acceleration" "year" "origin"
## [9] "name"
plot(df_auto$cylinders, df_auto$mpg)
Use the ggvis library for plots, or rCharts if I want JavaScript charts.
attach is not needed here because dplyr and ggvis let me reference the column name directly.
df_auto %>%
ggvis(~cylinders, ~mpg) %>%
layer_points(
fill := "green",
fillOpacity := .4
)
varwidth=T makes the box width based on the number of data points in the group I think
df_auto$cylinders <- as.factor(df_auto$cylinders)
plot(df_auto$cylinders, df_auto$mpg)
plot(df_auto$cylinders, df_auto$mpg, col = 'red', varwidth=T, horizontal=T)
plot(df_auto$cylinders, df_auto$mpg, col = 'red', varwidth=T, xlab="cylinders", ylab = "MPG")
df_auto %>%
ggvis(~cylinders, ~mpg) %>%
layer_boxplots(
width = .9,
fill := "green",
fillOpacity := .4
)
hist(df_auto$mpg)
hist(df_auto$mpg, col=2)
hist(df_auto$mpg, col=2, breaks=15)
attach(df_auto)
pairs(Auto)
pairs(~ mpg + displacement + horsepower + weight + acceleration)
identify function to work
plot(horsepower, mpg)
identify(horsepower, mpg, name)
## integer(0)
# Tooltip formatter handed to the add_tooltip function.
#
# x: the named values supplied for the point being inspected; may be NULL
#    (presumably when no point is under the cursor -- confirm against ggvis).
# Returns NULL when x is NULL; otherwise a single string holding one
# "name: value" entry per element of x, joined with HTML line breaks.
all_values <- function(x) {
  # Nothing to show: return early so no tooltip text is built.
  if (is.null(x)) {
    return(NULL)
  }
  str_c(names(x), ": ", format(x), collapse = "<br>")
}
df_auto %>%
ggvis(~horsepower, ~mpg) %>%
layer_points(
fill := "green",
fillOpacity := .6
) %>%
add_tooltip(all_values, "hover")